# Restore the objects saved in the cleaned-data .RData file, then print a
# compact column-by-column overview of `full_collisions` (expected to be one
# of the restored objects).
load("~/Documents/Nonparametric Statisics/Project/clean data/full_collisions.RData")
full_collisions %>% glimpse()
## Rows: 2,585,717
## Columns: 37
## $ accident_index                              <chr> "200501BS00001", "200501BS…
## $ accident_year                               <dbl> 2005, 2005, 2005, 2005, 20…
## $ accident_reference                          <chr> "01BS00001", "01BS00002", …
## $ location_easting_osgr                       <dbl> 525680, 524170, 524520, 52…
## $ location_northing_osgr                      <dbl> 178240, 181650, 182240, 17…
## $ longitude                                   <dbl> -0.191170, -0.211708, -0.2…
## $ latitude                                    <dbl> 51.48910, 51.52007, 51.525…
## $ police_force                                <fct> Metropolitan Police, Metro…
## $ accident_severity                           <fct> Serious, Slight, Slight, S…
## $ number_of_vehicles                          <dbl> 1, 1, 2, 1, 1, 2, 2, 1, 2,…
## $ number_of_casualties                        <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 2,…
## $ date                                        <date> 2005-01-04, 2005-01-05, 2…
## $ day_of_week                                 <fct> Tuesday, Wednesday, Thursd…
## $ time                                        <time> 17:42:00, 17:36:00, 00:15…
## $ local_authority_district                    <fct> "Kensington and Chelsea", …
## $ local_authority_ons_district                <chr> "E09000020", "E09000020", …
## $ local_authority_highway                     <chr> "E09000020", "E09000020", …
## $ first_road_class                            <fct> A, B, C, A, Unclassified, …
## $ first_road_number                           <dbl> 3218, 450, 0, 3220, 0, 0, …
## $ road_type                                   <fct> Single carriageway, Dual c…
## $ speed_limit                                 <dbl> 30, 30, 30, 30, 30, 30, 30…
## $ junction_detail                             <fct> Not at junction or within …
## $ junction_control                            <fct> Data missing or out of ran…
## $ second_road_class                           <fct> Not at junction or within …
## $ second_road_number                          <dbl> -1, 0, -1, -1, -1, -1, 0, …
## $ pedestrian_crossing_human_control           <fct> None within 50 metres, Non…
## $ pedestrian_crossing_physical_facilities     <fct> "Zebra", "Pedestrian phase…
## $ light_conditions                            <fct> Daylight, Darkness - light…
## $ weather_conditions                          <fct> Raining no high winds, Fin…
## $ road_surface_conditions                     <fct> Wet or damp, Dry, Dry, Dry…
## $ special_conditions_at_site                  <fct> None, None, None, None, No…
## $ carriageway_hazards                         <fct> None, None, None, None, No…
## $ urban_or_rural_area                         <fct> Urban, Urban, Urban, Urban…
## $ did_police_officer_attend_scene_of_accident <fct> Yes, Yes, Yes, Yes, Yes, Y…
## $ trunk_road_flag                             <fct> Non-trunk, Non-trunk, Non-…
## $ lsoa_of_accident_location                   <chr> "E01002849", "E01002909", …
## $ datetime                                    <dttm> 2005-01-04 17:42:00, 2005…

Exploratory data analysis (EDA) with a focus on accident severity:

0.1 severity

# Number of accidents in each severity class.
ggplot(full_collisions, aes(x = accident_severity)) +
  geom_bar()

## year

# Accident counts per year, with each bar stacked by severity.
ggplot(full_collisions, aes(x = accident_year)) +
  geom_bar(aes(fill = accident_severity))

fixing the scale to see the proportions:

# Severity shares per year: position = "fill" rescales every bar to the same
# total length. The year is mapped to the y axis, so the bars are horizontal.
ggplot(
  full_collisions,
  aes(y = accident_year, fill = accident_severity)
) +
  geom_bar(position = "fill")

The composition seems to change over the years: the share of serious accidents appears to grow relative to slight ones.

0.2 time

# Frequency polygons of the time of day (48 bins), one line per severity.
ggplot(full_collisions, aes(x = time, colour = accident_severity)) +
  geom_freqpoly(bins = 48)

# The same polygon restricted to fatal accidents, which are hard to read on
# the shared count scale above.
full_collisions %>%
  filter(accident_severity == "Fatal") %>%
  ggplot(aes(x = time, colour = accident_severity)) +
  geom_freqpoly(bins = 48)

Fatal accidents follow the same time-of-day pattern.

# Severity shares by time of day: a 48-bin histogram with every bin rescaled
# to the same total height.
ggplot(full_collisions, aes(x = time, fill = accident_severity)) +
  geom_histogram(bins = 48, position = "fill")

This could be interesting as well: during the night the share of more severe accidents increases.

0.3 day of the week

# Re-level day_of_week as an ordered factor running Monday to Sunday so the
# discrete x axis below follows the calendar week. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
full_collisions$day_of_week <- factor(
  full_collisions$day_of_week,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  ),
  ordered = TRUE
)

# Accident counts per weekday, stacked by severity.
full_collisions %>%
  ggplot(aes(x = day_of_week)) +
  geom_bar(aes(fill = accident_severity))

# Severity shares per weekday (every bar rescaled to the same height).
full_collisions %>%
  ggplot(aes(x = day_of_week, fill = accident_severity)) +
  geom_bar(position = "fill")

The severity composition changes across the days of the week.

0.4 date

# Accident counts by day of the year (366 bins), one line per accident year.
# Only `day` is derived because it is the only new column the plot maps.
# The legend is hidden.
full_collisions %>%
  mutate(day = yday(date)) %>%
  ggplot(aes(x = day, colour = as.factor(accident_year))) +
  geom_freqpoly(bins = 366) +
  theme(legend.position = "none")

We can see the COVID years:

More clearly, restricting the plot to the years after 2018:

# Same day-of-year plot limited to accident years after 2018, with the legend
# shown on top so the individual years can be told apart.
# Only `day` is derived because it is the only new column the plot maps.
full_collisions %>%
  filter(accident_year > 2018) %>%
  mutate(day = yday(date)) %>%
  ggplot(aes(x = day, colour = as.factor(accident_year))) +
  geom_freqpoly(bins = 366) +
  theme(legend.position = "top")

we can try to look for a trend in the day of the month:

# Accident counts by day of the month (31 bins), one line per accident year.
# Only `day` is derived because it is the only new column the plot maps.
# The legend is hidden.
full_collisions %>%
  mutate(day = mday(date)) %>%
  ggplot(aes(x = day, colour = as.factor(accident_year))) +
  geom_freqpoly(bins = 31) +
  theme(legend.position = "none")

No evident trend across the days of the month.

0.5 number of vehicles:

# Accident counts by number of vehicles involved, stacked by severity.
# The view is zoomed to 1-10 vehicles with coord_cartesian() instead of
# xlim(c(1, 10)): scale limits discard the bars centred on 1 and 10 (their
# edges fall outside the limits) and drop rows above 10 before counting,
# whereas zooming keeps every bar and raises no "removed rows" warnings.
# Breaks are set explicitly because the scale still spans the full data range.
full_collisions %>%
  ggplot(aes(x = number_of_vehicles)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_continuous(breaks = 1:10) +
  coord_cartesian(xlim = c(0.5, 10.5))

# Severity shares by number of vehicles, using the same zoomed view.
full_collisions %>%
  ggplot(aes(x = number_of_vehicles, fill = accident_severity)) +
  geom_bar(position = "fill") +
  scale_x_continuous(breaks = 1:10) +
  coord_cartesian(xlim = c(0.5, 10.5))

0.6 number of casualties:

# Accident counts by number of casualties, stacked by severity.
# The view is zoomed to 1-10 casualties with coord_cartesian() instead of
# xlim(c(1, 10)): scale limits discard the bars centred on 1 and 10 (their
# edges fall outside the limits) and drop rows above 10 before counting,
# whereas zooming keeps every bar and raises no "removed rows" warnings.
# Breaks are set explicitly because the scale still spans the full data range.
full_collisions %>%
  ggplot(aes(x = number_of_casualties)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_continuous(breaks = 1:10) +
  coord_cartesian(xlim = c(0.5, 10.5))

# Severity shares by number of casualties, using the same zoomed view.
full_collisions %>%
  ggplot(aes(x = number_of_casualties, fill = accident_severity)) +
  geom_bar(position = "fill") +
  scale_x_continuous(breaks = 1:10) +
  coord_cartesian(xlim = c(0.5, 10.5))

0.7 first road class:

# Accident counts per first road class, stacked by severity.
ggplot(full_collisions, aes(x = first_road_class)) +
  geom_bar(aes(fill = accident_severity))

# Severity shares per first road class (every bar rescaled to the same height).
ggplot(
  full_collisions,
  aes(x = first_road_class, fill = accident_severity)
) +
  geom_bar(position = "fill")

0.8 second road class

# Accident counts per second road class, stacked by severity; the x labels
# are rotated by 90 degrees.
ggplot(full_collisions, aes(x = second_road_class)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 90))

# Severity shares per second road class, with the same rotated labels.
ggplot(
  full_collisions,
  aes(x = second_road_class, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 90))

0.9 road type

# Accident counts per road type, stacked by severity; the x labels are
# rotated by 90 degrees.
ggplot(full_collisions, aes(x = road_type)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 90))

# Severity shares per road type, with the same rotated labels.
ggplot(full_collisions, aes(x = road_type, fill = accident_severity)) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 90))

0.10 speed limit

# Accident counts per speed limit, stacked by severity. The numeric limit is
# converted to a factor so each distinct value gets its own bar.
ggplot(full_collisions, aes(x = as.factor(speed_limit))) +
  geom_bar(aes(fill = accident_severity))

# Severity shares per speed limit (every bar rescaled to the same height).
ggplot(
  full_collisions,
  aes(x = as.factor(speed_limit), fill = accident_severity)
) +
  geom_bar(position = "fill")

0.11 junction detail

# Accident counts per junction detail, stacked by severity; the x labels are
# rotated by 90 degrees.
ggplot(full_collisions, aes(x = junction_detail)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 90))

# Severity shares per junction detail, with the same rotated labels.
ggplot(
  full_collisions,
  aes(x = junction_detail, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 90))

0.12 junction control

# Accident counts per junction control, stacked by severity; the x labels are
# rotated by 90 degrees.
ggplot(full_collisions, aes(x = junction_control)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 90))

# Severity shares per junction control, with the same rotated labels.
ggplot(
  full_collisions,
  aes(x = junction_control, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 90))

0.13 pedestrian_crossing_human_control

# First panel: accident counts per human-control category, stacked by
# severity, with x labels rotated by 60 degrees and the legend hidden.
p1 <- ggplot(full_collisions, aes(x = pedestrian_crossing_human_control)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Second panel: severity shares for the same categories.
p2 <- ggplot(
  full_collisions,
  aes(x = pedestrian_crossing_human_control, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.14 pedestrian_crossing_physical_facilities

# First panel: accident counts per physical crossing facility, stacked by
# severity, with x labels rotated by 60 degrees and the legend hidden.
# The mapped column is pedestrian_crossing_physical_facilities, matching this
# section's heading, not the human-control column plotted in the section
# before it.
p1 <- full_collisions %>%
  ggplot(aes(x = pedestrian_crossing_physical_facilities)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Second panel: severity shares for the same facility categories.
p2 <- full_collisions %>%
  ggplot(aes(
    x = pedestrian_crossing_physical_facilities,
    fill = accident_severity
  )) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.15 light_conditions

# First panel: accident counts per light condition, stacked by severity, with
# x labels rotated by 60 degrees and the legend hidden.
p1 <- ggplot(full_collisions, aes(x = light_conditions)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Second panel: severity shares per light condition.
p2 <- ggplot(
  full_collisions,
  aes(x = light_conditions, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.16 weather_conditions

# First panel: accident counts per weather condition, stacked by severity,
# with x labels rotated by 60 degrees and the legend hidden.
p1 <- ggplot(full_collisions, aes(x = weather_conditions)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Second panel: severity shares per weather condition.
p2 <- ggplot(
  full_collisions,
  aes(x = weather_conditions, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.17 special_conditions_at_site

# First panel: accident counts per special site condition, stacked by
# severity, with x labels rotated by 60 degrees and the legend hidden.
p1 <- ggplot(full_collisions, aes(x = special_conditions_at_site)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Second panel: severity shares per special site condition.
p2 <- ggplot(
  full_collisions,
  aes(x = special_conditions_at_site, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.18 carriageway_hazards

# First panel: accident counts per carriageway hazard, stacked by severity,
# with x labels rotated by 60 degrees and the legend hidden.
p1 <- ggplot(full_collisions, aes(x = carriageway_hazards)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Second panel: severity shares per carriageway hazard.
p2 <- ggplot(
  full_collisions,
  aes(x = carriageway_hazards, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.19 did_police_officer_attend_scene_of_accident

# First panel: accident counts by whether a police officer attended the
# scene, stacked by severity, with x labels rotated by 45 degrees and the
# legend hidden.
p1 <- ggplot(
  full_collisions,
  aes(x = did_police_officer_attend_scene_of_accident)
) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  theme(legend.position = "none")

# Second panel: severity shares for the same attendance categories.
p2 <- ggplot(
  full_collisions,
  aes(
    x = did_police_officer_attend_scene_of_accident,
    fill = accident_severity
  )
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2

0.20 trunk_road_flag

# First panel: accident counts per trunk-road flag, stacked by severity, with
# x labels rotated by 45 degrees and the legend hidden.
p1 <- ggplot(full_collisions, aes(x = trunk_road_flag)) +
  geom_bar(aes(fill = accident_severity)) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  theme(legend.position = "none")

# Second panel: severity shares per trunk-road flag.
p2 <- ggplot(
  full_collisions,
  aes(x = trunk_road_flag, fill = accident_severity)
) +
  geom_bar(position = "fill") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  theme(legend.position = "none")

# Combine both panels into a single figure.
p1 + p2